import multiprocessing
import re
import os

try:
    # Python 3
    import urllib.request as lib
    python3 = True
except ImportError:
    # Python 2
    import urllib as lib
    python3 = False


def craw_links(url, depth, keywords, processed):
    '''url: the url to crawl
    depth: the remaining depth to crawl
    keywords: the tuple of keywords to focus on
    processed: the list of urls that have already been crawled
    '''
    if url.startswith('http://') or url.startswith('https://'):
        if url not in processed:
            # mark this url as processed
            processed.append(url)
        else:
            # avoid processing the same url again
            return
        print('Crawling ' + url + '...')
        fp = lib.urlopen(url)
        if python3:
            # Python 3 returns bytes, so the page needs to be decoded
            contents = fp.read()
            contents_decoded = contents.decode('UTF-8')
        else:
            # Python 2 returns str, so no decode is needed
            contents_decoded = fp.read()
        fp.close()
        pattern = '|'.join(keywords)
        # if this page contains any of the keywords, save it to a file
        flag = False
        searched = None
        if pattern:
            searched = re.search(pattern, contents_decoded)
        else:
            # if no keywords are given, save every page
            flag = True
        print(flag, searched)
        if flag or searched:
            # build a file name from the url and save the page under the 'craw' directory
            filename = os.path.join('craw', url.replace(':', '_').replace('/', '_'))
            if python3:
                with open(filename, 'wb') as fp:
                    fp.write(contents)
            else:
                with open(filename, 'w') as fp:
                    fp.write(contents_decoded)
        # find all the links in the current page
        links = re.findall('href="(.*?)"', contents_decoded)
        # crawl every link in the current page
        for link in links:
            # turn a relative path into an absolute url
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index + 1] + link
                except ValueError:
                    pass
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth - 1, keywords, processed)


if __name__ == '__main__':
    processed = []
    keywords = ('KeyWord1', 'KeyWord2')
    # create the output directory if it does not exist
    if not os.path.exists('craw') or not os.path.isdir('craw'):
        os.mkdir('craw')
    craw_links(r'https://docs.python.org/3/library/index.html', 1, keywords, processed)
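

# --- Optional parallel variant -------------------------------------------------
# A minimal sketch of how the links on the start page could be crawled by a
# multiprocessing pool instead of the sequential recursion above.  The helper
# name craw_links_parallel, the workers parameter, and the shared Manager list
# are assumptions of this sketch, not part of the script above.
def craw_links_parallel(start_url, depth, keywords, workers=4):
    manager = multiprocessing.Manager()
    processed = manager.list()          # shared between workers to avoid duplicate crawls
    # fetch the start page and collect its links
    fp = lib.urlopen(start_url)
    page = fp.read().decode('UTF-8') if python3 else fp.read()
    fp.close()
    links = re.findall('href="(.*?)"', page)
    pool = multiprocessing.Pool(processes=workers)
    for link in links:
        if not link.startswith(('http://', 'https://')):
            # resolve a relative path against the start url, like craw_links does
            link = start_url[:start_url.rindex('/') + 1] + link
        if link.endswith(('.htm', '.html')):
            pool.apply_async(craw_links, (link, depth - 1, keywords, processed))
    pool.close()
    pool.join()

# example call (commented out so the script above keeps its original behaviour):
# craw_links_parallel(r'https://docs.python.org/3/library/index.html', 1, ('KeyWord1', 'KeyWord2'))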